# Load integrated dataset with IGO information
integrated_data <- read.csv("integrated_dataset_with_igo.csv", stringsAsFactors = FALSE)
# Load network status dataset with pre-calculated PCAs
network_data <- read.csv("all_network_status_pca.csv", stringsAsFactors = FALSE)
# Check dimensions
dim(integrated_data)
## [1] 8276 41
dim(network_data)
## [1] 2202 18
prepare the data for the attribute-based PCA. 1. Convert any non-numeric values to NA 2. Select appropriate variables for the PCA (excluding cinc as specified) 3. Handle missing values
# Convert "NA" strings to actual NA values
integrated_data <- integrated_data %>%
mutate(across(everything(), ~ifelse(. == "NA", NA, .)))
# Convert columns to appropriate types
integrated_data <- integrated_data %>%
mutate(across(c(milex, milper, irst, pec, tpop, upop,
full_memberships, associate_memberships, observer_statuses,
total_memberships, prestigious_memberships, regional_memberships,
global_memberships, security_memberships, economic_memberships,
political_memberships),
~as.numeric(.)))
# Convert political variables
integrated_data <- integrated_data %>%
mutate(across(c(democ, autoc, polity, polity2, xconst), ~as.numeric(.)))
# Let's look at missing values
missing_vals <- integrated_data %>%
select(milex, milper, irst, pec, tpop, upop,
democ, autoc, polity, polity2, xconst,
full_memberships, associate_memberships, observer_statuses,
prestigious_memberships, regional_memberships, global_memberships,
security_memberships, economic_memberships, political_memberships,
institutional_status, functional_balance) %>%
summarise(across(everything(), ~sum(is.na(.)))) %>%
gather(key = "variable", value = "missing_count")
# Display variables with missing values
missing_vals %>%
arrange(desc(missing_count)) %>%
mutate(percentage = missing_count / nrow(integrated_data) * 100) %>%
filter(missing_count > 0)
## variable missing_count percentage
## 1 full_memberships 398 4.8090865
## 2 associate_memberships 398 4.8090865
## 3 observer_statuses 398 4.8090865
## 4 prestigious_memberships 398 4.8090865
## 5 regional_memberships 398 4.8090865
## 6 global_memberships 398 4.8090865
## 7 security_memberships 398 4.8090865
## 8 economic_memberships 398 4.8090865
## 9 political_memberships 398 4.8090865
## 10 institutional_status 398 4.8090865
## 11 functional_balance 398 4.8090865
## 12 polity2 173 2.0903818
## 13 democ 82 0.9908168
## 14 autoc 82 0.9908168
## 15 polity 82 0.9908168
## 16 xconst 82 0.9908168
Since the network_data has data only for specific years, we’ll subset the integrated_data for those years:
# Extract years from network data
network_years <- unique(network_data$year)
# Filter integrated data for those years
integrated_subset <- integrated_data %>%
filter(Year %in% network_years)
# Check dimensions
dim(integrated_subset)
## [1] 1718 41
variables related 1. Material capabilities (milex, milper, pec, tpop, upop) 2. Institutional memberships (membership variables) 3. Political indicators (democ, polity2, xconst)
# Select variables for PCA
# We'll avoid using cinc as specified
pca_vars <- c("milex", "milper", "irst", "pec", "tpop", "upop",
"full_memberships", "prestigious_memberships",
"regional_memberships", "global_memberships",
"security_memberships", "economic_memberships", "political_memberships",
"polity2", "xconst")
# Create a dataframe with only the selected variables and complete cases
pca_data <- integrated_subset %>%
select(Destination, Year, all_of(pca_vars)) %>%
na.omit()
# Check dimensions
dim(pca_data)
## [1] 1505 17
# Scale the data before PCA
pca_data_scaled <- pca_data %>%
select(all_of(pca_vars)) %>%
scale()
# Perform PCA
pca_result <- PCA(pca_data_scaled, graph = FALSE)
# Examine eigenvalues to determine number of components to retain
fviz_eig(pca_result, addlabels = TRUE)
# Visualize variable contributions to first two dimensions
fviz_pca_var(pca_result, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
# Generate scree plot to determine how many components to retain
fviz_screeplot(pca_result, addlabels = TRUE)
# Print summary of PCA
summary(pca_result)
##
## Call:
## PCA(X = pca_data_scaled, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 5.257 3.621 1.166 1.010 0.794 0.700 0.583
## % of var. 35.049 24.140 7.773 6.734 5.296 4.667 3.889
## Cumulative % of var. 35.049 59.189 66.962 73.696 78.992 83.658 87.547
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 0.489 0.462 0.400 0.238 0.156 0.080 0.044
## % of var. 3.257 3.081 2.668 1.584 1.039 0.533 0.291
## Cumulative % of var. 90.804 93.885 96.553 98.137 99.176 99.709 100.000
## Dim.15
## Variance 0.000
## % of var. 0.000
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 3.483 | -2.770 0.097 0.632 | 1.354 0.034 0.151
## 2 | 3.132 | -2.508 0.080 0.641 | 1.231 0.028 0.155
## 3 | 2.899 | -2.341 0.069 0.652 | 1.127 0.023 0.151
## 4 | 2.847 | -2.289 0.066 0.646 | 1.161 0.025 0.166
## 7 | 2.702 | -2.175 0.060 0.648 | 1.024 0.019 0.144
## 8 | 5.742 | -2.172 0.060 0.143 | 0.814 0.012 0.020
## 9 | 2.572 | -1.812 0.041 0.496 | 1.307 0.031 0.258
## 13 | 4.727 | -3.331 0.140 0.497 | 1.994 0.073 0.178
## 14 | 4.792 | -3.384 0.145 0.499 | 2.063 0.078 0.185
## 15 | 4.689 | -3.868 0.189 0.680 | 2.189 0.088 0.218
## Dim.3 ctr cos2
## 1 | 0.230 0.003 0.004 |
## 2 | 0.169 0.002 0.003 |
## 3 | 0.092 0.000 0.001 |
## 4 | 0.051 0.000 0.000 |
## 7 | -0.005 0.000 0.000 |
## 8 | -0.436 0.011 0.006 |
## 9 | -0.181 0.002 0.005 |
## 13 | 1.739 0.172 0.135 |
## 14 | 1.763 0.177 0.135 |
## 15 | 0.847 0.041 0.033 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## milex | 0.518 5.112 0.269 | 0.339 3.169 0.115 | 0.609
## milper | 0.439 3.661 0.192 | 0.717 14.210 0.515 | -0.080
## irst | 0.561 5.995 0.315 | 0.596 9.812 0.355 | -0.004
## pec | 0.676 8.694 0.457 | 0.624 10.738 0.389 | 0.276
## tpop | 0.466 4.128 0.217 | 0.714 14.066 0.509 | -0.397
## upop | 0.566 6.098 0.321 | 0.698 13.461 0.487 | -0.280
## full_memberships | 0.817 12.687 0.667 | -0.424 4.967 0.180 | -0.168
## prestigious_memberships | 0.768 11.228 0.590 | -0.465 5.970 0.216 | -0.156
## regional_memberships | 0.624 7.395 0.389 | -0.303 2.542 0.092 | 0.022
## global_memberships | 0.810 12.493 0.657 | -0.422 4.908 0.178 | -0.172
## ctr cos2
## milex 31.841 0.371 |
## milper 0.546 0.006 |
## irst 0.001 0.000 |
## pec 6.540 0.076 |
## tpop 13.528 0.158 |
## upop 6.720 0.078 |
## full_memberships 2.411 0.028 |
## prestigious_memberships 2.097 0.024 |
## regional_memberships 0.042 0.000 |
## global_memberships 2.538 0.030 |
Based on the scree plot and eigenvalues, this part will extract the first component as the attribute-based status score:
# Extract PCA scores for the first component
pca_scores <- as.data.frame(pca_result$ind$coord)
# Create a dataframe with country, year, and PCA score
attribute_scores <- cbind(
pca_data %>% select(Destination, Year),
attribute_status_pca = pca_scores[, 1] # First principal component
)
# View the first few rows
head(attribute_scores)
## Destination Year attribute_status_pca
## 1 Afghanistan 1960 -2.769927
## 2 Afghanistan 1965 -2.508063
## 3 Afghanistan 1970 -2.340794
## 4 Afghanistan 1975 -2.289075
## 7 Afghanistan 1990 -2.175058
## 8 Afghanistan 1995 -2.172125
merge the attribute-based PCA scores with the network status data:
# Merge the datasets based on country and year
integrated_status <- network_data %>%
inner_join(attribute_scores, by = c("country" = "Destination", "year" = "Year"))
# Check the dimensions
dim(integrated_status)
## [1] 1505 19
create an overall status score by combining the attribute-based and network-based measures:
# Standardize both PCA scores
integrated_status <- integrated_status %>%
mutate(
attribute_status_pca_z = scale(attribute_status_pca)[,1],
overall_status_network_pca_z = scale(overall_status_network_pca)[,1]
)
# Create a combined status score (simple average)
integrated_status <- integrated_status %>%
mutate(combined_status_score = (attribute_status_pca_z + overall_status_network_pca_z) / 2)
# Create status quartiles
integrated_status <- integrated_status %>%
group_by(year) %>%
mutate(
attribute_status_quartile = ntile(attribute_status_pca, 4),
network_status_quartile = ntile(overall_status_network_pca, 4),
combined_status_quartile = ntile(combined_status_score, 4)
) %>%
ungroup()
# Create status types
integrated_status <- integrated_status %>%
mutate(
status_type = case_when(
combined_status_quartile == 4 ~ "High Status",
combined_status_quartile == 3 ~ "Upper-Middle Status",
combined_status_quartile == 2 ~ "Lower-Middle Status",
combined_status_quartile == 1 ~ "Low Status"
)
)
# Calculate status inconsistency (difference between attribute and network status)
integrated_status <- integrated_status %>%
mutate(status_inconsistency = abs(attribute_status_pca_z - overall_status_network_pca_z))
# View the final integrated dataset
head(integrated_status)
## # A tibble: 6 × 27
## country recognition_count weighted_recognition eigenvector_centrality pagerank
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Czecho… 39 49.6 0.528 0.0125
## 2 Egypt 58 60.2 0.710 0.0196
## 3 France 82 75.5 1 0.0379
## 4 India 55 43.8 0.660 0.0205
## 5 Indone… 33 31.8 0.490 0.0114
## 6 Iran 36 31.5 0.540 0.0114
## # ℹ 22 more variables: authority <dbl>, betweenness <dbl>,
## # recognition_rate <dbl>, network_inconsistency <dbl>, outgoing_ties <int>,
## # recognition_balance <int>, recognition_ratio <dbl>,
## # recognition_status_pca <dbl>, prestige_status_pca <dbl>,
## # brokerage_status_pca <dbl>, overall_status_network_pca <dbl>, year <int>,
## # external_internal_ratio <dbl>, attribute_status_pca <dbl>,
## # attribute_status_pca_z <dbl>, overall_status_network_pca_z <dbl>, …
creating visualizations to explore the status dimensions:
# Create a scatter plot of attribute status vs. network status
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca)) +
geom_point(aes(color = status_type)) +
geom_text_repel(data = subset(integrated_status,
attribute_status_pca > quantile(attribute_status_pca, 0.9) |
overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
aes(label = country), max.overlaps = 15) +
facet_wrap(~ year) +
labs(x = "Attribute-Based Status", y = "Network-Based Status",
title = "Status Dimensions by Year",
color = "Status Type") +
theme_minimal()
perform cluster analysis to identify status groups:
# Prepare data for clustering
cluster_data <- integrated_status %>%
select(attribute_status_pca, overall_status_network_pca)
# Scale the data
cluster_data_scaled <- scale(cluster_data)
# Determine optimal number of clusters
fviz_nbclust(cluster_data_scaled, kmeans, method = "silhouette")
fviz_nbclust(cluster_data_scaled, kmeans, method = "wss")
# Perform k-means clustering with the optimal number of clusters (from plots above)
# Let's assume the optimal number is 4 based on the previous plots
k <- 4
kmeans_result <- kmeans(cluster_data_scaled, centers = k, nstart = 25)
# Add cluster assignments to the data
integrated_status$cluster <- as.factor(kmeans_result$cluster)
# Visualize the clusters
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca, color = cluster)) +
geom_point(alpha = 0.7) +
geom_text_repel(data = subset(integrated_status,
attribute_status_pca > quantile(attribute_status_pca, 0.9) |
overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
aes(label = country), max.overlaps = 15) +
facet_wrap(~ year) +
labs(x = "Attribute-Based Status", y = "Network-Based Status",
title = "Status Clusters by Year") +
theme_minimal()
# Compare cluster assignments with status quartiles
table(integrated_status$cluster, integrated_status$combined_status_quartile)
##
## 1 2 3 4
## 1 380 349 44 0
## 2 0 0 0 15
## 3 0 29 332 158
## 4 0 0 0 198
hierarchical clustering:
# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled)
# Perform hierarchical clustering
hc_result <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
plot(hc_result, main = "Hierarchical Clustering Dendrogram",
xlab = "", sub = "", labels = FALSE)
# Cut the dendrogram to create the desired number of clusters
hc_clusters <- cutree(hc_result, k = 4)
# Add hierarchical cluster assignments to the data
integrated_status$hc_cluster <- as.factor(hc_clusters)
# Visualize the hierarchical clusters
ggplot(integrated_status, aes(x = attribute_status_pca, y = overall_status_network_pca, color = hc_cluster)) +
geom_point(alpha = 0.7) +
geom_text_repel(data = subset(integrated_status,
attribute_status_pca > quantile(attribute_status_pca, 0.9) |
overall_status_network_pca > quantile(overall_status_network_pca, 0.9)),
aes(label = country), max.overlaps = 15) +
facet_wrap(~ year) +
labs(x = "Attribute-Based Status", y = "Network-Based Status",
title = "Hierarchical Status Clusters by Year") +
theme_minimal()
# Create a wide format dataset to analyze status changes
status_over_time <- integrated_status %>%
select(country, year, combined_status_score, status_type) %>%
pivot_wider(
names_from = year,
values_from = c(combined_status_score, status_type),
names_sep = "_"
)
# Calculate status change for countries present in multiple years
status_change <- integrated_status %>%
select(country, year, combined_status_score) %>%
arrange(country, year) %>%
group_by(country) %>%
mutate(
prev_year = lag(year),
prev_score = lag(combined_status_score),
status_change = combined_status_score - prev_score
) %>%
filter(!is.na(status_change)) %>%
ungroup()
# Visualize status changes
ggplot(status_change, aes(x = year, y = status_change, group = country)) +
geom_line(alpha = 0.3) +
geom_point(alpha = 0.3) +
geom_text_repel(data = subset(status_change,
abs(status_change) > quantile(abs(status_change), 0.9, na.rm = TRUE)),
aes(label = country), max.overlaps = 15) +
labs(x = "Year", y = "Status Change",
title = "Status Mobility Over Time") +
theme_minimal()
# Export the final integrated dataset
write.csv(integrated_status, "integrated_status_dimensions.csv", row.names = FALSE)